Most Common Mutations in Each Cancer Type
(and Pan Cancer)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:data.table':
#>
#> between, first, last
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ forcats 1.0.0 ✔ readr 2.1.5
#> ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
#> ✔ lubridate 1.9.3 ✔ tibble 3.2.1
#> ✔ purrr 1.0.2 ✔ tidyr 1.3.1
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::between() masks data.table::between()
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::first() masks data.table::first()
#> ✖ lubridate::hour() masks data.table::hour()
#> ✖ lubridate::isoweek() masks data.table::isoweek()
#> ✖ dplyr::lag() masks stats::lag()
#> ✖ dplyr::last() masks data.table::last()
#> ✖ lubridate::mday() masks data.table::mday()
#> ✖ lubridate::minute() masks data.table::minute()
#> ✖ lubridate::month() masks data.table::month()
#> ✖ lubridate::quarter() masks data.table::quarter()
#> ✖ lubridate::second() masks data.table::second()
#> ✖ purrr::transpose() masks data.table::transpose()
#> ✖ lubridate::wday() masks data.table::wday()
#> ✖ lubridate::week() masks data.table::week()
#> ✖ lubridate::yday() masks data.table::yday()
#> ✖ lubridate::year() masks data.table::year()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
cancer_data <- as.data.frame(tcga_available())
cancer_dfs <- tibble(
cancer_name = character(),
data = list()
)
cancer_dfs_facet <- tibble(
cancer_name = character(),
data = list()
)
for (i in seq_along(cancer_data$Study_Abbreviation)) {
# Load TCGA mutation data
cancer_type <- cancer_data$Study_Abbreviation[i]
cancer_name <- gsub("_", " ", cancer_data$Study_Name[i])
mutations <- tcgaLoad(study = cancer_type)
mutation_data <- as.data.frame(mutations@data)
nucleotide_changes <- mutation_data[, c("Hugo_Symbol", "HGVSc")]
tumor_samples <- mutation_data[, c("Tumor_Sample_Barcode")]
num_tumors <- length(unique(tumor_samples))
# group_by HGVSc and add frequency using mutate, then ungroup
df <- nucleotide_changes %>%
group_by(HGVSc) %>%
mutate(freq = n()) %>%
ungroup()
# sort in descending order, distinct removes duplicate rows
df_sorted <- df %>%
arrange(desc(freq)) %>%
distinct(HGVSc, .keep_all = TRUE)
# Merge gene name and HGVSc into one column
df_merged <- df_sorted %>%
mutate(mutation = paste(Hugo_Symbol, HGVSc, sep = ", ")) %>%
select(mutation, freq) %>%
mutate(cancer_name_internal = cancer_name)
df_merged_trunc <- df_merged[1:50, ]
df_merged_trunc_facet <- df_merged[1:20, ]
# Convert the mutation column to a factor
df_merged_trunc$mutation <- factor(df_merged_trunc$mutation, levels = df_merged$mutation)
df_merged_trunc$freq <- df_merged_trunc$freq / num_tumors
df_merged_trunc_facet$mutation <- factor(df_merged_trunc_facet$mutation, levels = df_merged$mutation)
df_merged_trunc_facet$freq <- df_merged_trunc_facet$freq / num_tumors
# append dfs to list
cancer_dfs <- cancer_dfs %>% add_row(cancer_name, data = list(df_merged_trunc))
cancer_dfs_facet <- cancer_dfs_facet %>% add_row(cancer_name, data = list(df_merged_trunc_facet))
}
#> Loading ACC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading BLCA. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading BRCA. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading CESC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading CHOL. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading COAD. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading DLBC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading ESCA. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading GBM. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading HNSC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading KICH. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading KIRC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading KIRP. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading LAML. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading LGG. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading LIHC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading LUAD. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading LUSC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading MESO. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading OV. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading PAAD. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading PCPG. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading PRAD. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading READ. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading SARC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading SKCM. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading STAD. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading TGCT. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading THCA. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading THYM. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading UCEC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading UCS. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading UVM. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
library(ggplot2)
library(purrr)
library(gridExtra)
#>
#> Attaching package: 'gridExtra'
#> The following object is masked from 'package:dplyr':
#>
#> combine
# Create the bar plot
create_plot <- function(df, name) {
ggplot(data = df, aes(x = mutation, y = freq)) + # nolint: object_usage_linter.
geom_bar(stat = "identity", fill = "blue") +
geom_text(aes(label = round(freq, 3)), size = 3.5, hjust = 1.2, color = "white") +
coord_flip() +
labs(x = "Mutation", y = "Frequency", title = paste("Frequency of Mutations in ", name, sep = "")) +
theme_minimal()
}
plots <- pmap(cancer_dfs, function(cancer_name, data) {
create_plot(data, cancer_name)
})
plots
#> [[1]]

#>
#> [[2]]

#>
#> [[3]]

#>
#> [[4]]

#>
#> [[5]]

#>
#> [[6]]

#>
#> [[7]]

#>
#> [[8]]

#>
#> [[9]]

#>
#> [[10]]

#>
#> [[11]]

#>
#> [[12]]

#>
#> [[13]]

#>
#> [[14]]

#>
#> [[15]]

#>
#> [[16]]

#>
#> [[17]]

#>
#> [[18]]

#>
#> [[19]]

#>
#> [[20]]

#>
#> [[21]]

#>
#> [[22]]

#>
#> [[23]]

#>
#> [[24]]

#>
#> [[25]]

#>
#> [[26]]

#>
#> [[27]]

#>
#> [[28]]

#>
#> [[29]]

#>
#> [[30]]

#>
#> [[31]]

#>
#> [[32]]

#>
#> [[33]]

print(cancer_dfs_facet$data[3])
#> [[1]]
#> # A tibble: 20 × 3
#> mutation freq cancer_name_internal
#> <fct> <dbl> <chr>
#> 1 PIK3CA, c.3140A>G 0.128 Breast invasive carcinoma
#> 2 PIK3CA, c.1633G>A 0.0673 Breast invasive carcinoma
#> 3 PIK3CA, c.1624G>A 0.0526 Breast invasive carcinoma
#> 4 OR6C4, c.3G>A 0.0409 Breast invasive carcinoma
#> 5 HIF3A, c.49G>A 0.0390 Breast invasive carcinoma
#> 6 NLN, c.304G>A 0.0292 Breast invasive carcinoma
#> 7 TP53, c.524G>A 0.0292 Breast invasive carcinoma
#> 8 CUL3, c.274G>A 0.0273 Breast invasive carcinoma
#> 9 FHL1, c.550G>A 0.0273 Breast invasive carcinoma
#> 10 DCTPP1, c.232G>A 0.0263 Breast invasive carcinoma
#> 11 HMGB2, c.97G>A 0.0263 Breast invasive carcinoma
#> 12 CHRNB4, c.664G>A 0.0263 Breast invasive carcinoma
#> 13 FZD2, c.364G>A 0.0253 Breast invasive carcinoma
#> 14 ZSCAN1, c.85G>A 0.0253 Breast invasive carcinoma
#> 15 PRKCE, c.256G>A 0.0253 Breast invasive carcinoma
#> 16 TRIM65, c.466G>A 0.0244 Breast invasive carcinoma
#> 17 BEST3, c.220G>A 0.0244 Breast invasive carcinoma
#> 18 RNASE12, c.211G>A 0.0244 Breast invasive carcinoma
#> 19 ZBED4, c.244G>A 0.0244 Breast invasive carcinoma
#> 20 TRAV38-1, c.301G>A 0.0244 Breast invasive carcinoma
# print(cancer_dfs_facet$cancer_name)
library(tidyverse)
# Split df into 4 because too big for one facet plot
cancer_types <- unique(cancer_dfs_facet$cancer_name)
plots_per_group <- 4
n_groups <- ceiling(length(cancer_types) / plots_per_group)
group_assignments <- rep(1:n_groups, each = plots_per_group)[seq_along(cancer_types)]
cancer_type_groups <- split(cancer_types, group_assignments)
create_facet_plot <- function(df_data, cancer_types_subset) {
subset_data <- df_data %>% filter(cancer_name %in% cancer_types_subset) # nolint: object_usage_linter.
list_of_tibbles <- subset_data$data
combined_data <- bind_rows(list_of_tibbles)
# Create plot
ggplot(combined_data, aes(x = mutation, y = freq)) + # nolint: object_usage_linter.
geom_bar(stat = "identity", fill = "blue") +
geom_text(aes(label = round(freq, 3)), size = 3.5, hjust = 1.2, color = "white") +
coord_flip() +
labs(x = "Mutation", y = "Frequency") +
facet_wrap(~cancer_name_internal, scales = "free") +
theme_minimal()
}
# Create facet plots for each group of cancer types
facet_plots <- lapply(cancer_type_groups, function(cancer_types_subset) {
create_facet_plot(cancer_dfs_facet, cancer_types_subset)
})
for (i in seq_along(facet_plots)) {
print(facet_plots[[i]])
}









# Get all cancer types
cancer_types <- unique(cancer_data$Study_Abbreviation)
# Function to load mutations individually because TCGAmutations does not support loading all
load_mutations <- function(cancer_type) {
mutations <- tcgaLoad(study = cancer_type) # nolint: object_usage_linter.
mutation_data <- as.data.frame(mutations@data)
mutation_data$cancer_type <- cancer_type
return(mutation_data)
}
pan_cancer_list <- lapply(cancer_types, load_mutations)
#> Loading ACC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading BLCA. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading BRCA. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading CESC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading CHOL. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading COAD. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading DLBC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading ESCA. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading GBM. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading HNSC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading KICH. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading KIRC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading KIRP. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading LAML. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading LGG. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading LIHC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading LUAD. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading LUSC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading MESO. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading OV. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading PAAD. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading PCPG. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading PRAD. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading READ. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading SARC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading SKCM. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading STAD. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading TGCT. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading THCA. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading THYM. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading UCEC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading UCS. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading UVM. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
pan_cancer_df <- bind_rows(pan_cancer_list)
#> New names:
#> • `IMPACT` -> `IMPACT...19`
#> • `IMPACT` -> `IMPACT...21`
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> • `IMPACT` -> `IMPACT...19`
#> • `IMPACT` -> `IMPACT...21`
head(pan_cancer_df)
#> Hugo_Symbol Chromosome Start_Position End_Position Variant_Classification
#> 1 OPN4 10 88419681 88419681 Missense_Mutation
#> 2 KLRB1 12 9760409 9760409 Missense_Mutation
#> 3 SALL2 14 21991730 21991730 Missense_Mutation
#> 4 C15orf27 15 76467904 76467904 Frame_Shift_Del
#> 5 KLHDC4 16 87742934 87742934 Missense_Mutation
#> 6 NOL11 17 65734087 65734087 Missense_Mutation
#> Variant_Type Reference_Allele Tumor_Seq_Allele2 Tumor_Sample_Barcode
#> 1 SNP G A TCGA-OR-A5J1-01A-11D-A29I-10
#> 2 SNP C G TCGA-OR-A5J1-01A-11D-A29I-10
#> 3 SNP C T TCGA-OR-A5J1-01A-11D-A29I-10
#> 4 DEL C - TCGA-OR-A5J1-01A-11D-A29I-10
#> 5 SNP C T TCGA-OR-A5J1-01A-11D-A29I-10
#> 6 SNP A C TCGA-OR-A5J1-01A-11D-A29I-10
#> Matched_Norm_Sample_Barcode HGVSc HGVSp_Short Transcript_ID
#> 1 TCGA-OR-A5J1-10A-01D-A29L-10 c.863G>A p.G288D ENST00000372071
#> 2 TCGA-OR-A5J1-10A-01D-A29L-10 c.27G>C p.E9D ENST00000229402
#> 3 TCGA-OR-A5J1-10A-01D-A29L-10 c.2132G>A p.R711Q ENST00000327430
#> 4 TCGA-OR-A5J1-10A-01D-A29L-10 c.657delC p.Y219* ENST00000388942
#> 5 TCGA-OR-A5J1-10A-01D-A29L-10 c.1384G>A p.D462N ENST00000270583
#> 6 TCGA-OR-A5J1-10A-01D-A29L-10 c.1528A>C p.S510R ENST00000253247
#> Exon_Number t_ref_count t_alt_count n_ref_count n_alt_count IMPACT...19
#> 1 7/11 133 10 140 0 MODERATE
#> 2 1/6 166 113 186 0 MODERATE
#> 3 2/2 54 47 78 1 MODERATE
#> 4 8/11 121 70 113 0 HIGH
#> 5 10/12 122 45 112 0 MODERATE
#> 6 13/18 29 24 50 0 MODERATE
#> ExAC_AF IMPACT...21 FILTER Tumor_Sample_Barcode_min cancer_type
#> 1 . MODERATE PASS TCGA-OR-A5J1 ACC
#> 2 . MODERATE PASS TCGA-OR-A5J1 ACC
#> 3 . MODERATE PASS TCGA-OR-A5J1 ACC
#> 4 . HIGH PASS TCGA-OR-A5J1 ACC
#> 5 . MODERATE PASS TCGA-OR-A5J1 ACC
#> 6 . MODERATE PASS TCGA-OR-A5J1 ACC
nucleotide_changes <- pan_cancer_df[, c("Hugo_Symbol", "HGVSc", "cancer_type")]
tumor_samples <- pan_cancer_df[, c("Tumor_Sample_Barcode")]
num_tumors <- length(unique(tumor_samples))
print(length(tumor_samples))
#> [1] 2147998
print(num_tumors)
#> [1] 10201
# group_by HGVSc and add frequency using mutate, then ungroup
df <- nucleotide_changes %>%
group_by(HGVSc) %>%
mutate(freq = n()) %>%
ungroup()
# sort in descending order, distinct removes duplicate rows
df_sorted <- df %>%
arrange(desc(freq)) %>%
distinct(HGVSc, .keep_all = TRUE)
# Merge gene name and HGVSc into one column
df_merged <- df_sorted %>%
mutate(mutation = paste(Hugo_Symbol, HGVSc, sep = ", ")) %>%
select(mutation, freq, cancer_type)
df_merged_trunc <- df_merged[1:50, ]
# Convert the mutation column to a factor
df_merged_trunc$mutation <- factor(df_merged_trunc$mutation, levels = df_merged$mutation)
df_merged_trunc$freq <- df_merged_trunc$freq / num_tumors
df_merged_trunc
#> # A tibble: 50 × 3
#> mutation freq cancer_type
#> <fct> <dbl> <chr>
#> 1 RBMXL2, c.3G>A 0.0863 BLCA
#> 2 COL5A2, c.395G>A 0.0717 ACC
#> 3 OR4K15, c.3G>T 0.0600 ACC
#> 4 BRAF, c.1799T>A 0.0560 BLCA
#> 5 ALG10, c.4G>A 0.0548 ACC
#> 6 MROH5, c.145G>A 0.0539 ACC
#> 7 GRIK5, c.331G>A 0.0528 BLCA
#> 8 ZNF598, c.397G>A 0.0523 ACC
#> 9 PTH, c.226G>A 0.0516 BLCA
#> 10 OGG1, c.346G>A 0.0513 ACC
#> # ℹ 40 more rows
pancan_plot <- ggplot(data = df_merged_trunc, aes(x = mutation, y = freq)) +
geom_bar(stat = "identity", fill = "blue") +
geom_text(aes(label = round(freq, 3)), size = 3.5, hjust = 1.2, color = "white") +
coord_flip() +
labs(x = "Mutation", y = "Frequency", title = "Pan-cancer Frequency of Mutations") +
theme_minimal()
pancan_plot
